suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(here))
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/zygoticEmbryogenesis/doc/ZE_Dataset_v3.csv",
col_types = cols(col_character(),
col_character(),
col_factor(),
col_character(),
col_factor(),
col_character(),
col_double(),
col_double())) %>%
mutate(Tissue=factor(Tissue)) %>%
mutate(Time=factor(Time))
lb.filelist <- list.files(here("data/RNA-Seq/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Filter and select only one duplicate from each sample.
lb.filterlist <- str_subset(lb.filelist,"L001_sortmerna_trimmomatic")
#removed P11562_112 trimmomatic dataset
lb.filterlist <- (str_subset(lb.filterlist, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_112_S11_L001_sortmerna_trimmomatic/quant.sf", negate = TRUE))
#removed P11562_112 row from sample list
samples <- filter(samples, !grepl("112",NGI.ID))
#samples <- filter(samples, !grepl("P11562_201",NGI.ID)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S
#lb.filterlist <- (str_subset(lb.filterlist, "P11562_201", negate = TRUE)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S
##extract S tissue rows
##remove those rows from original
trim_samples <- subset(samples, samples$Tissue == "S")
samples <- subset(samples, samples$Tissue != "S")
samples
## # A tibble: 47 x 8
## NGI.ID User.ID Time Sample.ID Tissue Replicate Mreads X..Q30
## <chr> <chr> <fct> <chr> <fct> <chr> <dbl> <dbl>
## 1 P11562_110 PabB4C2_FMG B4 B4C2 FMG B4-FMG 13.1 91.4
## 2 P11562_111 PabB4C2_ZE B4 B4C2 ZE B4-ZE 16.6 93.0
## 3 P11562_113 PabB4C4_ZE B4 B4C4 ZE B4-ZE 17.0 93.4
## 4 P11562_114 PabB4C5_FMG B4 B4C5 FMG B4-FMG 12.7 89.9
## 5 P11562_115 PabB4C5_ZE B4 B4C5 ZE B4-ZE 12.5 91.1
## 6 P11562_116 PabB5C1_FMG B5 B5C1 FMG B5-FMG 9.46 91.8
## 7 P11562_117 PabB5C1_ZE B5 B5C1 ZE B5-ZE 15.6 92.1
## 8 P11562_118 PabB5C5_FMG B5 B5C5 FMG B5-FMG 14.1 90.2
## 9 P11562_119 PabB5C5_ZE B5 B5C5 ZE B5-ZE 13.8 90.2
## 10 P11562_120 PabB5C6_FMG B5 B5C6 FMG B5-FMG 16.4 90
## # … with 37 more rows
#trim_samples
##match NGI.IDs to filelist
##remove files
lb.filterlist_trim <- lb.filterlist
lb.filterlist_trim <- str_subset(lb.filterlist_trim, c(trim_samples$NGI.ID))
## Warning in stri_subset_regex(string, pattern, omit_na = TRUE, negate =
## negate, : longer object length is not a multiple of shorter object length
lb.filterlist_trim <- append(lb.filterlist_trim, c(str_subset(lb.filterlist, "P11562_201")))
#
#
lb.filterlist <- str_subset(lb.filterlist, c(trim_samples$NGI.ID), negate = TRUE)
## Warning in stri_subset_regex(string, pattern, omit_na = TRUE, negate =
## negate, : longer object length is not a multiple of shorter object length
lb.filterlist <- str_subset(lb.filterlist, "P11562_201", negate = TRUE)
#
#
#double_A <- trim_samples
#double_B <- trim_samples
#double_A$NGI.ID <- str_replace(double_A$NGI.ID,"P11562_","P11562_A_")
#double_B$NGI.ID <- str_replace(double_B$NGI.ID,"P11562_","P11562_B_")
#double_A$Tissue <- str_replace(double_A$Tissue,"S","FMG")
#double_B$Tissue <- str_replace(double_B$Tissue,"S","ZE")
#double_A$Replicate <- str_replace(double_A$Replicate,"S","FMG")
#double_B$Replicate <- str_replace(double_B$Replicate,"S","ZE")
#
#lb.filterlist_trim_A <- lb.filterlist_trim
#lb.filterlist_trim_B <- lb.filterlist_trim
stopifnot(all(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist)))
##assign names to the filtered filelist (which removed L002)
names(lb.filterlist) <- samples$NGI.ID
#names(lb.filterlist_trim_A) <- double_A$NGI.ID
#names(lb.filterlist_trim_B) <- double_B$NGI.ID
lb.filterlist <- lb.filterlist[samples$Tissue %in% c("ZE","FMG")] ##Are we only looking at ZE?
#lb.filterlist_trim_A <- lb.filterlist_trim_A[double_A$Tissue %in% c("FMG")] ##Are we only looking at ZE?
#lb.filterlist_trim_B <- lb.filterlist_trim_B[double_B$Tissue %in% c("ZE")] ##Are we only looking at ZE?
#samples <- bind_rows(samples,double_A,double_B)
samples <- subset(samples, select = -c(User.ID, Sample.ID, Replicate, Mreads, X..Q30) )
#, lb.filterlist_trim_A, lb.filterlist_trim_B
Read the expression at the gene level (there is one transcript per gene)
lb.g <- suppressMessages(tximport(files = c(lb.filterlist),
type = "salmon",txOut=TRUE))
counts <- round(lb.g$counts)
Check how many genes are never expressed - reasonable level of non-expressed genes indicated.
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "21.9% percent (14484) of 66069 genes are not expressed"
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Tissue=samples$Tissue[match(ind,samples$NGI.ID)]) %>%
mutate(Time=samples$Time[match(ind,samples$NGI.ID)])
dir.create(here("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("analysis/salmon/ZE-ZF-unnormalised-gene-expression_data.csv"))
############## change export location name
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Time * Tissue)
## converting counts to integer mode
## factor levels were dropped which had no samples
#want to duplicate S here, rename each twin to FMG and ZE, and give them unique NGI.IDs
save(dds,file=here("analysis/salmon/ZE-ZF-Dataset-dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is relatively variable (0 to 200 %) however the low end of the variance is likely due to poor samples where little sequencing data was actually produce that wasnt 16s bacterial.
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11562_110 | P11562_111 | P11562_113 | P11562_114 | P11562_115 | P11562_116 |
|---|---|---|---|---|---|
| 0.492 | 1.509 | 1.352 | 0.6335 | 0.9565 | 0.2825 |
| P11562_117 | P11562_118 | P11562_119 | P11562_120 | P11562_121 | P11562_123 |
|---|---|---|---|---|---|
| 1.147 | 0.526 | 1.003 | 0.6942 | 1.286 | 1.086 |
| P11562_124 | P11562_125 | P11562_126 | P11562_127 | P11562_128 | P11562_129 |
|---|---|---|---|---|---|
| 0.4092 | 1.17 | 0.5539 | 1.503 | 1.061 | 1.206 |
| P11562_130 | P11562_131 | P11562_133 | P11562_134 | P11562_135 | P11562_136 |
|---|---|---|---|---|---|
| 1.079 | 1.19 | 1.697 | 1.138 | 1.515 | 1.234 |
| P11562_137 | P11562_138 | P11562_139 | P11562_140 | P11562_141 | P11562_142 |
|---|---|---|---|---|---|
| 1.037 | 0.8039 | 1.053 | 1.017 | 1.676 | 1.535 |
| P11562_143 | P11562_145 | P11562_146 | P11562_147 | P11562_148 | P11562_149 |
|---|---|---|---|---|---|
| 0.9657 | 1.448 | 0.8732 | 0.9698 | 1.709 | 1.019 |
| P11562_150 | P11562_151 | P11562_153 | P11562_154 | P11562_155 | P11562_157 |
|---|---|---|---|---|---|
| 1.57 | 1.244 | 0.5682 | 1.044 | 1.726 | 1.196 |
| P11562_202 | P11562_203 | P11562_204 | P11562_205 | P11562_206 |
|---|---|---|---|---|
| 1.071 | 1.266 | 1.065 | 1.069 | 0.6645 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked very well, the data is nearly homoscesdastic
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
plot(cumsum(percent))
Seems that different time points form small clusters and ZE and FMG tissue types appear to separate. These are Comp1 and Comp2 which explains the different between most of the sampels except for one B4 ZE sample. Appears to be an outlier. This seems to indicate that the Tissue and Time components explain the difference between samples. ### 2D
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples)
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
suppressPackageStartupMessages(library(plotly))
interplot <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(interplot, tooltip = "all") %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 1 leaves about 32000 genes - is this adequate for the QA?
conds <- factor(paste(samples$Tissue,samples$Time))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot
vstCutoff <- 4+1
heatmap.2(t(scale(t(vst[sels[[vstCutoff]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The data appears to show a correlation between Tissue and Time, with some clustering in the PCA plot showing a movement of time in ascending order from right to left (+ to -) along the X axis. There can also be seen that along the Y axis, there are subgroups which correlate to the Tissue type. Along with this, it appears that overall gene expression does not appear starkly different but slight differences can be observed between Tissue types. It appears that the most difference between Tissues can be seen in the 1000 most variable genes, with some small changes between Time points within those Tissue groups. The final heat map of the 1000 least variable genes appear to show a very mixed expression pattern, similar in appearance to the total gene heat map, between Tissue groups and Time points, except for in the Somatic tissue type.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] here_0.1 vsn_3.52.0
## [3] tximport_1.12.3 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.3
## [7] purrr_0.3.2 readr_1.3.1
## [9] tidyr_0.8.3 tibble_2.1.3
## [11] tidyverse_1.2.1 scatterplot3d_0.3-41
## [13] RColorBrewer_1.1-2 plotly_4.9.0
## [15] pander_0.6.3 hyperSpec_0.99-20180627
## [17] ggplot2_3.2.1 lattice_0.20-38
## [19] gplots_3.0.1.1 DESeq2_1.24.0
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [23] BiocParallel_1.18.1 matrixStats_0.54.0
## [25] Biobase_2.44.0 GenomicRanges_1.36.0
## [27] GenomeInfoDb_1.20.0 IRanges_2.18.2
## [29] S4Vectors_0.22.0 BiocGenerics_0.30.0
## [31] data.table_1.12.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.1
## [4] XVector_0.24.0 base64enc_0.1-3 rstudioapi_0.10
## [7] hexbin_1.27.3 affyio_1.54.0 bit64_0.9-7
## [10] fansi_0.4.0 AnnotationDbi_1.46.1 lubridate_1.7.4
## [13] xml2_1.2.2 splines_3.6.1 geneplotter_1.62.0
## [16] knitr_1.24 zeallot_0.1.0 Formula_1.2-3
## [19] jsonlite_1.6 broom_0.5.2 annotate_1.62.0
## [22] cluster_2.1.0 shiny_1.3.2 BiocManager_1.30.4
## [25] compiler_3.6.1 httr_1.4.1 backports_1.1.4
## [28] assertthat_0.2.1 Matrix_1.2-17 lazyeval_0.2.2
## [31] limma_3.40.6 cli_1.1.0 later_0.8.0
## [34] acepack_1.4.1 htmltools_0.3.6 tools_3.6.1
## [37] gtable_0.3.0 glue_1.3.1 GenomeInfoDbData_1.2.1
## [40] affy_1.62.0 Rcpp_1.0.2 cellranger_1.1.0
## [43] vctrs_0.2.0 preprocessCore_1.46.0 gdata_2.18.0
## [46] nlme_3.1-141 crosstalk_1.0.0 xfun_0.9
## [49] testthat_2.2.1 rvest_0.3.4 mime_0.7
## [52] gtools_3.8.1 XML_3.98-1.20 zlibbioc_1.30.0
## [55] scales_1.0.0 promises_1.0.1 hms_0.5.1
## [58] yaml_2.2.0 memoise_1.1.0 gridExtra_2.3
## [61] rpart_4.1-15 latticeExtra_0.6-28 stringi_1.4.3
## [64] RSQLite_2.1.2 highr_0.8 genefilter_1.66.0
## [67] checkmate_1.9.4 caTools_1.17.1.2 rlang_0.4.0
## [70] pkgconfig_2.0.2 bitops_1.0-6 evaluate_0.14
## [73] labeling_0.3 htmlwidgets_1.3 bit_1.1-14
## [76] tidyselect_0.2.5 magrittr_1.5 R6_2.4.0
## [79] generics_0.0.2 Hmisc_4.2-0 DBI_1.0.0
## [82] pillar_1.4.2 haven_2.1.1 foreign_0.8-72
## [85] withr_2.1.2 survival_2.44-1.1 RCurl_1.95-4.12
## [88] nnet_7.3-12 modelr_0.1.5 crayon_1.3.4
## [91] utf8_1.1.4 KernSmooth_2.23-15 rmarkdown_1.15
## [94] locfit_1.5-9.1 readxl_1.3.1 blob_1.2.0
## [97] digest_0.6.20 xtable_1.8-4 httpuv_1.5.1
## [100] munsell_0.5.0 viridisLite_0.3.0